
Multi-class Classification using Vowpal Wabbit
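This example trains a multi-class classifier on the Iris dataset using SynapseML's Vowpal Wabbit bindings. It assumes a Spark environment with SynapseML installed and a predefined `spark` session (as in Synapse or Fabric notebooks). If you are running elsewhere, the following is a minimal sketch for creating a session with the SynapseML package; the Maven coordinate and version are assumptions to adjust for your setup.

from pyspark.sql import SparkSession

# minimal standalone setup; adjust the SynapseML coordinate/version for your cluster
spark = (
    SparkSession.builder.appName("vw-multiclass-iris")
    .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.4")
    .getOrCreate()
)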

Read dataset

import pyspark.sql.types as T
from pyspark.sql import functions as F

schema = T.StructType(
    [
        T.StructField("sepal_length", T.DoubleType(), False),
        T.StructField("sepal_width", T.DoubleType(), False),
        T.StructField("petal_length", T.DoubleType(), False),
        T.StructField("petal_width", T.DoubleType(), False),
        T.StructField("variety", T.StringType(), False),
    ]
)

df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema)
    .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/iris.txt")
)
# print basic information about the dataset
print("records read: " + str(df.count()))
print("Schema: ")
df.printSchema()
display(df)
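The Iris dataset is balanced, with 50 rows for each of the three varieties. A quick check of the class distribution (no assumptions beyond the dataframe read above):

df.groupBy("variety").count().show()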

Use VowpalWabbitFeaturizer to convert the data features into a vector

from pyspark.ml.feature import StringIndexer

from synapse.ml.vw import VowpalWabbitFeaturizer

indexer = StringIndexer(inputCol="variety", outputCol="label")
featurizer = VowpalWabbitFeaturizer(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features",
)

# the label must be an integer from 0 to numClasses - 1 (zero-based indexing)
df_label = indexer.fit(df).transform(df).withColumn("label", F.col("label").cast("int"))

# featurize data
df_featurized = featurizer.transform(df_label).select("label", "features")

display(df_featurized)
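The featurizer hashes column names and values into a sparse feature vector, mirroring the feature hashing VW uses internally, so no dense vector assembly is needed. To inspect a single featurized row:

print(df_featurized.first())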

Split the dataset into train and test

train, test = df_featurized.randomSplit([0.8, 0.2], seed=1)
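As a sanity check, the split sizes should be roughly 120/30 for the 150-row dataset (randomSplit proportions are approximate, not exact):

print("train rows: " + str(train.count()))
print("test rows: " + str(test.count()))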

Model Training

from synapse.ml.vw import VowpalWabbitClassifier


# VW pass-through arguments:
#   --oaa 3                   one-against-all multiclass reduction over 3 classes
#   --holdout_off             disable VW's holdout set (we train with multiple passes)
#   --loss_function=logistic  optimize logistic loss
#   --indexing 0              labels are zero-based
#   -q ::                     add quadratic interactions across all namespace pairs
model = (
    VowpalWabbitClassifier(
        numPasses=5,
        passThroughArgs="--holdout_off --oaa 3 --loss_function=logistic --indexing 0 -q ::",
    )
    .setNumClasses(3)
    .fit(train)
)
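To reuse the trained model later, a minimal sketch using standard Spark ML persistence follows; the path is hypothetical, and loading via VowpalWabbitClassificationModel assumes SynapseML's usual MLWritable/MLReadable support.

from synapse.ml.vw import VowpalWabbitClassificationModel

model_path = "/tmp/vw_iris_model"  # hypothetical path; use durable storage in practice
model.write().overwrite().save(model_path)
loaded_model = VowpalWabbitClassificationModel.load(model_path)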

Model Prediction

predictions = model.transform(test)

display(predictions)
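To put a number on the predictions, a minimal evaluation sketch with Spark's built-in evaluator is shown below; it assumes the classifier writes its output to the default prediction column.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
print("test accuracy: %.3f" % evaluator.evaluate(predictions))

# per (label, prediction) counts, as a rough confusion matrix
predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()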